---
title: "Chang_Wang_2023_The_Reach_of_the_State_replication_code"
author: "Charles Chang and Yuhua Wang"
date: "2023-07-05"
output: html_document
---

################################################################
#Replication#
#The Reach of the State#

#Comparative Political Studies#

#Charles Chang#
#charles.c.chang@dukekunshan.edu.cn#

#Yuhua Wang#
#yuhuawang@fas.harvard.edu#

#R version 4.3.1#
################################################################

################################################################
#This R script includes codes that replicate all tables and figures in the main text and Online Appendix, except Tables A1-3, A1-4, and A1-6, which were produced in Stata#

#For Tables A1-3, A1-4, and A1-6, please use Tables A1-3 and A1-4 and A1-6.do#
################################################################

## Table 1: Categories of State Agencies in the Amap 2018 Database

Note: An account for Google Cloud Console is required to access raw data in this list. 
We have also provided a tutorial to assist you with basic queries and analyses using BigQuery. 

Data Table: [gaode2018](console.cloud.google.com/bigquery?ws=!1m5!1m4!4m3!1svigilant-balm-122322!2sPOIChina!3sgaode2018)


## Figure1: Density of State Agencies across Chinese Counties (2018)

Data Table: [gaode2018](console.cloud.google.com/bigquery?ws=!1m5!1m4!4m3!1svigilant-balm-122322!2sPOIChina!3sgaode2018)


## Figure 2: Economies of Scale in State Administration across Chinese Counties

```{r}

my_log <- file("./chang_wang_2023_console_log.txt") # File name of output log
sink(my_log, append = TRUE, type = "output") # Writing console output to log file
sink(my_log, append = TRUE, type = "message")

cat(readChar(rstudioapi::getSourceEditorContext()$path, # Writing currently opened R script to file
             file.info(rstudioapi::getSourceEditorContext()$path)$size))

rm(list = ls())
library(foreign) 
library(ggplot2) 
require(ggplot2)
library(ggrepel) 
require(gridExtra)
library(tidyverse)

figure2 <- read.dta("./county 2018 gaode data for R.dta")  # what is this file?

ggplot(figure2, aes(x=pop_log, y=state_agencies_pc_log)) + 
  geom_point(shape=19, color="violetred4", size=3)+
  geom_smooth(method=lm,  se=TRUE,linetype="solid",
              color="deepskyblue4", fill="deepskyblue2")+
  scale_y_continuous(name="Per Capita State Agencies (log)",limits=c(4,9), breaks=c(4,5,6,7,8,9))+
  scale_x_continuous(name="Population (log)", limits=c(-4,1), breaks=c(-4,-3,-2,-1,0,1))+
  annotate("text", x=0, y=8.3, label= "Slope = -0.302") + 
  theme_bw()

```

## Figure 3: Correlations between Amap 2018 and Data from Other Years and Sources

```{r}
# remove all the existing variables from R working environment
rm(list=ls(all=TRUE))

# load Gaode 2018 data
load("./gaode18.Rda")

# load Gaode 2017 data
load("./gaode17.Rda")

# load Gaode 2016 data
load("./gaode16.Rda")

# load Baidu 2015 data
load("./baidu15.Rda")

# load Land use data in 2018
load("./lu18.Rda")

# load Chinese Census data circa 2010
load("./Census2010.Rda")


library("readxl")

census2010 <- read_excel("./2010CountyCensusL.xlsx", sheet = "Sheet1")

library(dplyr)


# select POIs of government agencies

GovPOI <- names(poi_data)[246:286]
GovPOI

governmentPOIs <- c("X130105","X130106","X130501","X130502","X130503","X130505","X130506","X130701","X130702","X130703")

IDs <- names(poi_data)[0:3]

Census <- names(poi_data)[4:26]

# Sum up government POIs into one column
poi_data <- dplyr::mutate(poi_data, govs18 = rowSums(dplyr::select(poi_data, all_of(governmentPOIs))))

# replace administrative units without POIs with 0
poi_data[poi_data == 0] <- NA


# Government POIs by Million population. Some districts/counties do not have population data
poi_data <- transform(poi_data, govpermillion = govs18 / pop_million)

# saveRDS(poi_data, file = "./poi_data.rds")

# Government POIs by population using 2010 census with almost no missing data
gd18 <- merge(poi_data, data, by.x="gbcounty", by.y="GbCounty")
gd18 <- transform(gd18, govperpop = govs18 / A100001)

# saveRDS(gd18, file = "./gd18.rds")

# Gaode 17
gd17 <- dplyr::mutate(gd17, govs17 = rowSums(dplyr::select(gd17, all_of(governmentPOIs))))

gd17[gd17 == 0] <- NA

gd17 <- transform(gd17, govpermillion17 = govs17 / pop_million)

## Gaode 16

gd16 <- dplyr::mutate(gd16, govs16 = rowSums(dplyr::select(gd16, all_of(governmentPOIs))))

gd16[gd16 == 0] <- NA

gd16 <- transform(gd16, govpermillion16 = govs16 / pop_million)

## Baidu 15

baidu15 <- merge(bd15, dplyr::select(all_of(poi_data), c("gbcounty", Census)), by.x="gbcounty", by.y="gbcounty")

baidu15 <- transform(baidu15, govpermillion15 = numpts / pop_million )

## census and land use
otherdata <- merge(lu_data, census2010, by.x="gbcounty", by.y="GbCounty")

## Land use
LUtypes <- names(lu_data)[27:37]

lu_data[lu_data == 0] <- NA

lu_data <- transform(lu_data, govpermillionlu = X501 / (100000*pop_million))
lu_data <- transform(lu_data, govlu = X501)
lu_data <- transform(lu_data, govtranslu = X501 + X402 + X403)
lu_data <- transform(lu_data, govtranspermillionlu = (X501 + X402 + X403) / (100000*pop_million))

# merge two data frames for correlation by gbcounty
data2 <- merge(dplyr::select(all_of(poi_data), c("gbcounty", "govs18", "pop_million")), dplyr::select(lu_data, all_of(c("gbcounty", "govpermillionlu", "govlu", "govtranspermillionlu", "govtranslu"))), by.x="gbcounty", by.y="gbcounty")
 
data3 <- merge(dplyr::select(all_of(poi_data), c("gbcounty", "govpermillion")), dplyr::select(gd17, all_of(c("gbcounty", "govpermillion17", "govs17"))), by.x="gbcounty", by.y="gbcounty")

data4 <- merge(dplyr::select(all_of(poi_data), c("gbcounty", "govpermillion")), dplyr::select(gd16, all_of(c("gbcounty", "govpermillion16", "govs16"))), by.x="gbcounty", by.y="gbcounty")

data5 <- merge(dplyr::select(all_of(poi_data), c("gbcounty", "govpermillion")), dplyr::select(baidu15, all_of(c("gbcounty", "govpermillion15", "numpts"))), by.x="gbcounty", by.y="gbcounty")

data6 <- merge(dplyr::select(all_of(poi_data), c("gbcounty")), dplyr::select(otherdata, all_of(c("gbcounty", "L600021"))), by.x="gbcounty", by.y="gbcounty")

# Merge all data
# df_list <- list(data2, data3, data4, data5, data6)
# 
# # merge all data frames in list
# df_combine <- df_list %>% reduce(full_join, by='gbcounty')

data23 <- merge(data2, data3, by.x="gbcounty", by.y="gbcounty")
data234 <- merge(data23, data4, by.x="gbcounty", by.y="gbcounty")
data2345 <- merge(data234, data5, by.x="gbcounty", by.y="gbcounty")
data23456 <- merge(data2345, data6, by.x="gbcounty", by.y="gbcounty") 

# save(data23456,file="data_merge.Rda")

library(corrplot)
library(ggplot2)
library("Hmisc")
library(GGally)
library("PerformanceAnalytics")
library(psych)

# select a subset of variables for plotting
Fig3.dat <- dplyr::select(data23456, all_of(c("govs18", "govs17", "govs16", "numpts", "L600021")))

Fig3.dat[Fig3.dat == 0] <- NA

# summary(Fig3.dat)


# Plotting
# pdf("./Figure3.pdf")
# ggpairs(A1.dat, title="correlogram with ggpairs()") 
g <- ggpairs(Fig3.dat,columnLabels = c("State 2018 (Amap)", "State 2017 (Amap)","State 2016 (Amap)","State 2015 (Baidu)",  "State Employees 2010"), axisLabels = "none")


print(g)
# dev.off()



```


## Figure 5: Counties with New Stability Maintenance Agencies in 2016


```{r}

# See Counties_with_New_Stability_Maintenance_Agencies.csv
# countyid denotes the Chinese administrative divisions code at county level

```

## Figure 6: Number of Protests across Chinese Counties in 2015 and 2017


```{r}
# see 2014_protests_for_maps.csv and 2016_protests_for_maps.csv
```

## Figure 7: DID Plot

```{r}
# remove all the existing variables from R working environment
rm(list=ls(all=TRUE))

library(foreign)
# Need to add the data in the Dropbox folder
figure7 <- read.dta("./DiD graph data.dta")

attach(figure7)

figure7$treat <- as.factor(figure7$treat)

ggplot(data=figure7, aes(x=year, y=protest,group=treat,shape=treat,color=treat)) + 
  geom_line(aes(linetype=treat), size=1) + 
  scale_linetype_manual(values=c("solid", "dashed", "dotted"))+
  scale_color_manual(values=c("dark red",  "dark blue", "dark blue"))+
  geom_point(size=1,fill="red")+
  scale_x_continuous(name="Year", limits=c(2013,2017), breaks=c(2013,2015,2017)) +
  scale_y_continuous(name="N of protests",limits=c(0,6), breaks=c(0,1,2,3,4,5,6))+
  theme(legend.position="none",axis.text=element_text(size=12),
        axis.title=element_text(size=12,face="bold"))+
  annotate("text", x = 2015.92, y=2.75, label = "Treatment   Group",fontface=2)+
  annotate("text", x = 2015.5, y=1.75, label = "Control Group",fontface=2)+
  annotate("text", x = 2016, y=3.8, label = "Counterfactual",fontface=2)+
  annotate("text", x = 2016.85, y=2.15, label = "DID=-1.384***",fontface=2)
```


## Figure A1-2: Principal Component Analysis Results

```{r}
# remove all the existing variables from R working environment
rm(list=ls(all=TRUE))

## load data
load("./PCAdata.Rda")

cameras <- read.csv(file = './township_camera2018.csv')

data1 <- merge(data, cameras, by.x = "gbtownship", by.y = "gbtownship", all.x = TRUE, all.y = FALSE)

data1[is.na(data1)] <- 0

# saveRDS(data1, file = "data_for_PCA.rds")

library(dplyr)

# drop extra columns
data.sup <- dplyr::select(data1,c(1,2,3,4,5,6,7,8,9))

# Drop 3rd,4th and 5th columns of the dataframe
poi <- dplyr::select(data1,-c(1,2,3,4,5,6,7,8,9))

# Scale data to a mean of 0 and a standard deviation of 1
# scaled.dat <- scale(poi)
data.active <- data1[,10:828]

# area and population
F1 <- names(data.active)[0:8]
# Car related
F2 <- names(data.active)[9:205] 
# restaurants
F3 <- names(data.active)[206:270]
# markets
F4 <- names(data.active)[271:353]
# shops
F5 <- names(data.active)[354:405]
# sports and leisure
F6 <- names(data.active)[406:451]
# health care
F7 <- names(data.active)[452:476]
# hotels
F8 <- names(data.active)[477:484]
# attractions
F9 <- names(data.active)[485:503]
# residences
F10 <- names(data.active)[504:514]
# government organizations
F11 <- names(data.active)[515:523]
# foreign government organizations
F12 <- names(data.active)[524:527]
# social organizations
F13 <- names(data.active)[528:537]
# law enforcement
F14 <- names(data.active)[538:544]
# car management
F15 <- names(data.active)[545:551]
# tax management
F16 <- names(data.active)[552:555]
# schools
F17 <- names(data.active)[556:584]
# transportation
F18 <- names(data.active)[585:625]
# banks
F19 <- names(data.active)[626:724]
# corportates
F20 <- names(data.active)[725:746]
# road infrastructure
F21 <- names(data.active)[747:754]
# place names
F22 <- names(data.active)[755:789]
# public facilities
F23 <- names(data.active)[790:798]
# events
F24 <- names(data.active)[799:809]
# indoor infrastructure
F25 <- names(data.active)[810:818]

# Combine variables
data.active <- dplyr::mutate(data.active, cars = rowSums(dplyr::select(data.active, all_of(F2))))
data.active <- dplyr::mutate(data.active, restaurants = rowSums(dplyr::select(data.active, all_of(F3))))
data.active <- dplyr::mutate(data.active, markets = rowSums(dplyr::select(data.active, all_of(F4))))
data.active <- dplyr::mutate(data.active, shops = rowSums(dplyr::select(data.active, all_of(F5))))
data.active <- dplyr::mutate(data.active, sports = rowSums(dplyr::select(data.active, all_of(F6))))
data.active <- dplyr::mutate(data.active, healths = rowSums(dplyr::select(data.active, all_of(F7))))
data.active <- dplyr::mutate(data.active, hotels = rowSums(dplyr::select(data.active, all_of(F8))))
data.active <- dplyr::mutate(data.active, attractions = rowSums(dplyr::select(data.active, all_of(F9))))
data.active <- dplyr::mutate(data.active, residences = rowSums(dplyr::select(data.active, all_of(F10))))
data.active <- dplyr::mutate(data.active, schools = rowSums(dplyr::select(data.active, all_of(F17))))
data.active <- dplyr::mutate(data.active, transportations = rowSums(dplyr::select(data.active, all_of(F18))))
data.active <- dplyr::mutate(data.active, banks = rowSums(dplyr::select(data.active, all_of(F19))))
data.active <- dplyr::mutate(data.active, corporates = rowSums(dplyr::select(data.active, all_of(F20))))
data.active <- dplyr::mutate(data.active, publicfacs = rowSums(dplyr::select(data.active, all_of(F23))))
extras <- names(data.active)[819:832]


# Generate a new variable “Town Government” = 130105 + 130106
data.active <- mutate(data.active, TownGovernment = rowSums(dplyr::select(data.active, c("X130105", "X130106"))))

# Generate a new variable “Taxation” = 130702 + 130703
data.active <- mutate(data.active, Taxation = rowSums(dplyr::select(data.active, c("X130702", "X130703"))))

# Generate a new variable public health
data.active <- mutate(data.active, PublicHealth = rowSums(dplyr::select(data.active, all_of(F7[1:22]))))

# Generate a new variable public education
data.active <- mutate(data.active, PublicEducation = rowSums(dplyr::select(data.active, c("X140000", "X140100", "X140400", "X140500", "X140600", "X140700", "X140800", "X140900", "X141200", "X141201", "X141202", "X141203", "X141204", "X141205", "X141206"))))

# Generate a new variable public services
data.active <- mutate(data.active, PublicServices = rowSums(dplyr::select(data.active, c("X070200", "X070201", "X070800", "X071902", "X080000", "X080100", "X080108", "X110000", "X110100", "X110101", "X110105", "X150700", "X200000", "X200200", "X200300", "X200400"))))

# Generate a new variable Foreign Direct Investment
data.active <- mutate(data.active, FDI = rowSums(dplyr::select(data.active, c("X010110", "X010111", "X020100", "X020104", "X020105", "X020106", "X020200", "X020203", "X020300", "X020400", "X020401", "X020402", "X020403", "X020404",  "X020600", "X020601", "X020602", "X020700", "X020703", "X020800", "X020900", "X021000", "X021003", "X021100", "X021200", "X021203", "X021300", "X021400", "X021500", "X021501", "X021700", "X021701", "X021800", "X021802", "X021803", "X021900", "X022000", "X022100", "X022301", "X022400", "X022500", "X022501", "X022502", "X022600", "X022700", "X022800", "X026000", "X026300", "X030204", "X030205", "X030206", "X030300", "X030303", "X030400", "X030500", "X030502", "X030503", "X030700", "X030702", "X030800", "X030801", "X030900", "X031000", "X031100", "X031103", "X031200", "X031300", "X031303", "X031400", "X031500", "X031600", "X031601", "X031701", "X031800", "X031801", "X031900", "X031902", "X031903", "X032000", "X032100", "X032200", "X032401", "X032600", "X032601", "X032700", "X032800", "X035400", "X036000", "X036300", "X039900", "X040101", "X040201", "X050310", "X050501", "X050504", "X060201", "X060202", "X060401", "X060402", "X060406", "X060407", "X060409", "X060413", "X060414", "X060415", "X160124", "X160125", "X160126", "X160127", "X160130", "X160131", "X160133", "X160134", "X160135", "X160136", "X160137", "X160138", "X160331", "X160334"))))


A3.dat <- mutate(data.active, Business = rowSums(dplyr::select(data.active, c(F2, F3, F4, F5, F6, F7, F8, F9, F10, F17, F18, F19, F20, F23))))
A3.dat <- dplyr::select(all_of(A3.dat), c(F1, F11, F12, F13, F14, F15, F16, Business, extras, "TownGovernment", "Taxation"))
# A3.dat <- A3.dat[0:50]

# A3 <- c("area", "a100001",  "TownGovernment", "X130501", "X130502", "X130503", "X130505", "X130506", "X130701", "Taxation", "Business")
A3 <- c("a100001",  "TownGovernment", "X130501", "X130502", "X130503", "X130505", "X130506", "X130701", "Taxation", "Business")
A3.dat <- dplyr::select(all_of(A3.dat), A3)

# Take out population and economic organizations
A2 <- c("TownGovernment", "X130501", "X130502", "X130503", "X130505", "X130506", "X130701", "Taxation")
A2.dat <- dplyr::select(all_of(A3.dat), A2)


# Analysis 2

A2.dat <- A2.dat %>% rename(TownGovernment = TownGovernment,
                            Police = X130501,
                            Procuratorate = X130502,
                            Court = X130503,
                            Notary = X130505,
                            StabilityMaintenance = X130506,
                            Industry_Commerce = X130701,
                            Taxation = Taxation
                            )

pca_res2 <- prcomp(A2.dat, scale. = TRUE)

# Extract PC axes for plotting
PCAvalues2 <- data.frame(pca_res2$x)

# Extract loadings of the variables
PCAloadings2 <- data.frame(Variables = rownames(pca_res2$rotation), pca_res2$rotation)
# PCAloadings2 <- data.frame(Variables = rownames(pca_res2), pca_res2)

# export loadings to tables
library(kableExtra)
# tables(PCAloadings2$Variables, PCAloadings2$PC1)

# show table of Principal Component loadings
PCAloadings2 %>% kbl() %>% kable_classic(full_width = F, html_font = "Cambria")

# plot PCA loadings

# pdf("./Analysis2_POIloadings.pdf")
# autoplot(pca_res2, data = A2.dat, label.color = 'black', 
#          loadings = TRUE, loadings.colour = 'azure4',
#          loadings.label = TRUE, loadings.label.size = 3) +
#   theme_bw() 
# dev.off()

# plot PCA loadings with annotations

# pdf("./Analysis2_POIloadings_v1.pdf")
p <- ggplot(PCAvalues2, aes(x = PC1, y = PC2)) +
  geom_segment(data = PCAloadings2, aes(x = 0, y = 0, xend = (PC1*50),
     yend = (-PC2*50)), arrow = arrow(length = unit(1/2, "picas")),
     color = "black") +
  geom_point(size = 2) +
  annotate("text", x = c(20.61785, 18.95607, 20.96302, 17.16181, 19.70190, 19.93425, 24.14218, 21.91258), y = c(-17.535826, -21.568843, -6.627428, 39.292237, 17.011017,  -11.659695,  3.090667,  1.587598),
     label = c("Notary", "Stability Maintenance", "Police", "Procuratorate", "Court", "Town Government", "Industry and Commerce", "Taxation")) +
  theme(
    panel.background = element_rect(fill = "transparent"), # bg of the panel
    # plot.background = element_rect(fill = "transparent", color = NA), # bg of the plot
    # panel.grid.major = element_blank(), # get rid of major grid
    # panel.grid.minor = element_blank(), # get rid of minor grid
    # legend.background = element_rect(fill = "transparent"), # get rid of legend bg
    # legend.box.background = element_rect(fill = "transparent") # get rid of legend panel bg
  )
p
# dev.off()


library(factoextra)
# Print the plot to a pdf file
# pdf("./Analysis2_scree_plot.pdf")

scree_plot <- fviz_eig(pca_res2, addlabels = TRUE, ylim = c(0, 90))
print(scree_plot)
# dev.off()

```

## Figure 4: Public Spaces and Police Stations across Chinese Townships (2018)
```{r}
rm(list=ls(all=TRUE))

## load data
load("./PCAdata.Rda")

cameras <- read.csv(file = './township_camera2018.csv')

data1 <- merge(data, cameras, by.x = "gbtownship", by.y = "gbtownship", all.x = TRUE, all.y = FALSE)

data1[is.na(data1)] <- 0

library(dplyr)


# Generate a new variable for public services, which is used in Figure 4a
data.mapping <- mutate(data1, PublicServices = rowSums(dplyr::select(data1, c("X070200", "X070201", "X070800", "X071902", "X080000", "X080100", "X080108", "X110000", "X110100", "X110101", "X110105", "X150700", "X200000", "X200200", "X200300", "X200400"))))


# Select a subset of variables for analysis
data.mapping <- dplyr::select(data.mapping, c("gbtownship", "townshipen", "gbcounty","area", "a100001", "X130501", "PublicServices"))

# Renaming columns
data.mapping  <- data.mapping  %>% rename(Police = X130501,
                                          Population = a100001,
                                          County = gbcounty,
                                          Area = area)


```


## Table 2: Public Spaces and Police Stations: OLS Estimates with Township-Level Data
```{r}
# remove all the existing variables from R working environment
rm(list=ls(all=TRUE))

# fixed effects model
library(plm)
library(fixest)
library(lmtest)

data1 <- readRDS(file = "./data_for_PCA.rds")

data.mapping <- mutate(data1, PublicServices = rowSums(dplyr::select(data1, c("X070200", "X070201", "X070800", "X071902", "X080000", "X080100", "X080108", "X110000", "X110100", "X110101", "X110105", "X150700", "X200000", "X200200", "X200300", "X200400"))))

data.mapping <- dplyr::select(data.mapping, c("gbtownship", "townshipen", "gbcounty","area", "a100001", "X130501", "PublicServices"))

# Renaming columns
data.mapping  <- data.mapping  %>% rename(Police = X130501,
                                          Population = a100001,
                                          County = gbcounty,
                                          Area = area)

# Log Tranformation
data.mapping$logPolice <- log(data.mapping$Police + 1)
data.mapping$logPublicServices <- log(data.mapping$PublicServices + 1)
data.mapping$logPopulation <- log(data.mapping$Population + 1)
data.mapping$logArea <- log(data.mapping$Area + 1)

# basemodel
ols1 = feols(logPolice ~ logPublicServices | County, data.mapping)

# control population
ols2 = feols(logPolice ~ logPublicServices + logPopulation | County, data.mapping)

# control land area
ols3 = feols(logPolice ~ logPublicServices + logPopulation  + logArea | County, data.mapping)

# show statistical table
etable(ols1, ols2, ols3, 
       dict=c("logPublicServices" = "N of Public Services (log)", "logPopulation" = "Population (log)", "logArea" = "Area (log)"),
       vcov = "twoway", headers = c("(1)", "(2)", "(3)"))

# generate latex code
# etable(ols1, ols2, ols3,
#        vcov = "twoway", headers = c("(1)", "(2)", "(3)"), tex = TRUE)
```


## Figure A1-4: Locations of State Agencies in Dongcheng District (Beijing)

```{r}

# See dongcheng_pois.gpkg

```


## Table A1-3: Coercion and Protest: DID Estimates with County-Level Data
```{r}
# See Tables A1-3 and A1-4 and A1-6.do
```

## Figure A1-3: Correlations between State Agencies at Different Levels
```{r}
rm(list=ls(all=TRUE))

# load  data
poi_data <- readRDS(file = "./poi_data.rds")

# select data at national, provincial, prefectural, county level
B1.dat <- dplyr::select(poi_data, all_of(c("X130101","X130102","X130103","X130104")))

# replace 0 with NA
B1.dat[B1.dat == 0] <- NA

# pdf("./Analysis8_corrplot_levels_gov.pdf")
# ggpairs(A1.dat, title="correlogram with ggpairs()") 
g <- ggpairs(B1.dat,columnLabels = c("National-Level Agencies","Provincial-Level Agencies","Prefectural-Level Agencies","County-Level Agencies"), axisLabels = "none") 

print(g)


# dev.off()

```



## Table A1-4: State Agencies and Protest: DID Estimates with County-Level Data
```{r}
# See Tables A1-3 and A1-4 and A1-6.do
```


## Figure A1-5: State Agencies and Missingness in Official Statistics

```{r}
rm(list=ls(all=TRUE))

# load Gaode 2018 data
gd18 <- readRDS(file = "./gd18.rds")

# Code missing data
gd18$PopNA <- is.na(gd18$pop_million)
gd18$GDPNA <- is.na(gd18$gdp_millionyuan)
gd18$TaxNA <- is.na(gd18$taxrevenue_million)
gd18$SpendingNA <- is.na(gd18$spending_millionyuan)


A1_5.dat <- dplyr::select(gd18, c("PopNA", "GDPNA", "TaxNA", "SpendingNA", "govperpop"))

# Renaming columns
A1_5.dat <- A1_5.dat %>% rename(Population = PopNA,
                            GDP = GDPNA,
                            Tax = TaxNA,
                            Spending = SpendingNA
                            )

library(tidyr)

data_long <- tidyr::gather(A1_5.dat, condition, missing, Population:Spending, factor_key=TRUE)


# pdf("./Analysis8_boxplot_v1.pdf")
# grouped boxplot
ggplot(data_long, aes(x=condition, y=govperpop, fill=missing)) + 
    geom_boxplot(outlier.shape = NA) + ylab("Per Capita State") + 
    xlab(" ") +
    scale_y_continuous(limits = c(0, 0.0015)) +
    guides(fill=guide_legend(title="Missing Value")) +
    scale_fill_manual(values=c("#f0f0f0", "#636363"))
# dev.off()


```



## Figure A1-6: Number of State Agencies around the World
```{r}
# See worldbank_osm_gov.xls
```


## Figure A1-7: Correlation Plots of Cross-National Indicators
```{r}
# Data and code
```

## Figure A1-8: Economies of Scale in State Administration across Countries
```{r}
rm(list=ls(all=TRUE))

library(foreign)
library(ggplot2)

figureA1.8 <- read.dta("./cross-national data for R.dta")


ggplot(figureA1.8, aes(x=pop_log, y=pc_num_gov_log)) + 
  geom_point(shape=19, color="violetred4", size=3)+
  geom_smooth(method=lm,  se=TRUE,linetype="solid",
              color="deepskyblue4", fill="deepskyblue2")+
  scale_y_continuous(name="Per Capita State Agencies (log)",limits=c(0,8), breaks=c(0,2,4,6,8))+
  scale_x_continuous(name="Population (log)", limits=c(0,8), breaks=c(0,2,4,6,8))+
  annotate("text", x=7, y=4.5, label= "Slope = -0.300") + 
  theme_bw()
```

## Table A1-6: Patterns of Missingness in World Bank Data: OLS Estimates

```{r}
# See Tables A1-3 and A1-4 and A1-6.do
```



## Figure A1-9: Correlation Plot of State Infrastructure and State Capacity Measuress

```{r}
rm(list=ls(all=TRUE))

library(tidyverse)
library(readxl)

# read data

osm_ctry <- read.csv(file = './OSM_gov_country.csv')

pwt <- read_excel("./OSM_PWT_VDEM_POLITY.xlsx")

osm_id_country <- read_excel("./osm_id_country.xlsx")

stateCapacity <- read.csv(file = './cross-national_capacity_measures.csv')

# data processing

osm_gov1 <- merge(osm_ctry, pwt, by.x = "osm_id", by.y = "osm_id")

osm_gov2 <- merge(osm_gov1, stateCapacity, by.x = "countrycode", by.y = "CountryCode")

# summary(osm_gov)
osm_gov <- merge(osm_gov2, osm_id_country, by.x = "osm_id", by.y = "osm_id")

GovPOI <- c("countrycode", "area", "num_gov", "rgdpna", "pop", "v2x_polyarchy", "polity2", "v2x_polyarchy", "Year","V.Dem_country_code", "CapacityHansonSigman", "FiscalCapacity", "StateAntiquityIndex", "StatenessIndex", "Weberiannes", "WGI", "Hendrix", "FragileStatesIndex", "Taxrevenue", "logMyersIndex")

osmPOI <- select(osm_gov, all_of(GovPOI))

osmPOI <- osmPOI %>% mutate(POIpc = num_gov/(pop+1))

osmPOI <- osmPOI %>% mutate(GDPpc = rgdpna/(pop))

osmPOI <- osmPOI %>% mutate(Areapc = area/(pop))

GovPOI2 <- c("num_gov", "POIpc", "rgdpna", "GDPpc", "polity2")

osmPOI2 <- select(osmPOI, all_of(GovPOI2))

library(dplyr)
library(tidyr)

# Subset Rows by Checking values on Multiple Columns
df <- osmPOI[osmPOI$Year > 2006,]

# Sort by column index [1] then [3]
df <- df[
  order( df[,1], df[,8] ),
]

# Fill missing value
df <- df %>%
  group_by(countrycode) %>%
  fill(logMyersIndex) %>%
  fill(logMyersIndex, .direction = "up")

df <- df %>%
  group_by(countrycode) %>%
  fill(WGI) %>%
  fill(WGI, .direction = "up")

df <- df %>%
  group_by(countrycode) %>%
  fill(FragileStatesIndex) %>%
  fill(FragileStatesIndex, .direction = "up")

df <- df %>%
  group_by(countrycode) %>%
  fill(CapacityHansonSigman) %>%
  fill(CapacityHansonSigman, .direction = "up")

df <- df %>%
  group_by(countrycode) %>%
  fill(StateAntiquityIndex) %>%
  fill(StateAntiquityIndex, .direction = "up")

df <- df %>%
  group_by(countrycode) %>%
  fill(Taxrevenue) %>%
  fill(Taxrevenue, .direction = "up")


df <- df %>%
  group_by(countrycode) %>%
  summarise_at(vars(c("num_gov", "POIpc", "area", "rgdpna", "pop", "CapacityHansonSigman", "FiscalCapacity", "StateAntiquityIndex", "StatenessIndex", "Weberiannes", "WGI", "Hendrix", "FragileStatesIndex", "Taxrevenue", "logMyersIndex")), list(mean))

# select variable for correlation

V1.dat <- dplyr::select(df, c("POIpc", "CapacityHansonSigman", "StateAntiquityIndex", "WGI", "FragileStatesIndex", "Taxrevenue", "logMyersIndex"))


# summary(V1.dat)


library(ggplot2)
library(GGally)

# pdf("./corrplot_osm4.pdf")

g <- ggpairs(V1.dat, columnLabels = c("OSM","Hanson/Sigman", "State Antiquity", "WGI", "Fragile State", "Tax/GDP", "Myers Index"), axisLabels = "none")
print(g)

# dev.off()

```


Regression
```{r}
lm1 <- lm(polity2 ~ log(GDPpc+1) + log((num_gov + 1)/(pop+1)) + log(pop+1), data = osmPOI)

library(stargazer)

stargazer(lm1,  
          type = "text", ci.level = 0.95, star.cutoffs = c(0.05, 0.01, 0.001),
          dep.var.labels = c("Polity Score"),
          covariate.labels = c("Per Capita GDP (log)", "N of state agencies per million (log)" , "Population (log)"))


closeAllConnections() # Close connection to log file
```




